
library(seqinr)
library(stringr)
library(dplyr)
library(vioplot)
library(gmodels)

# ---- Musite prediction results ------------------------------------------

# Directory holding the four Musite prediction result files.
musite_dir <- "D:/Pipeline comparisons/Writing/Data/Musite/Plasmodium"

# Read one tab-delimited Musite prediction file and prepare it for scoring.
#
# path: path to a prediction results file with (at least) the columns
#       `Residue` and `PTMscores`.
#
# Returns the file as a data.frame restricted to rows that have a residue,
# with an extra numeric column `Scores` holding the text after the last ":"
# in `PTMscores` (NA, with a warning, when that text is not numeric).
read_musite_predictions <- function(path) {
  predictions <- read.delim(path, header = TRUE, sep = "\t", dec = ".")

  # `Residue != ""` alone yields NA for a missing residue, and indexing a
  # data.frame with NA inserts an all-NA row; test for NA explicitly so such
  # rows are dropped instead.
  has_residue <- !is.na(predictions$Residue) & predictions$Residue != ""
  predictions <- predictions[has_residue, ]

  predictions$Scores <- as.numeric(word(predictions$PTMscores, -1, sep = ":"))
  predictions
}

file1 <- read_musite_predictions(file.path(musite_dir, "Prediction_results_File1.txt"))
file2 <- read_musite_predictions(file.path(musite_dir, "Prediction_results_File2.txt"))
file3 <- read_musite_predictions(file.path(musite_dir, "Prediction_results_File3.txt"))
file4 <- read_musite_predictions(file.path(musite_dir, "Prediction_results_File4.txt"))

# Stack the four result files into a single prediction table.
AllPlasmodium_Musite <- rbind.data.frame(file1, file2, file3, file4)

# ---- GSB site tables, one per pipeline (TPP, MQ, PD) ---------------------

PD_GSB <- read.csv("D:/Pipeline comparisons/Writing/Data/GSB/Plasmodium/PD_GSB.csv")
MQ_GSB <- read.csv("D:/Pipeline comparisons/Writing/Data/GSB/Plasmodium/MQ_GSB.csv")
TPP_GSB <- read.csv("D:/Pipeline comparisons/Writing/Data/GSB/Plasmodium/TPP_GSB.csv")

# Build the "<protein>_<position>" key used for all joins below. Only the MQ
# table gets it here; the TPP and PD tables are merged on PROTEIN_LOC as read.
MQ_GSB$PROTEIN_LOC <- paste(MQ_GSB$Protein, MQ_GSB$PROTEIN_POS_NUM, sep = "_")

# Same key on the Musite side: "<ID>_<Position>".
AllPlasmodium_Musite$PROTEIN_LOC <- paste(
  AllPlasmodium_Musite$ID,
  AllPlasmodium_Musite$Position,
  sep = "_"
)

# Number of distinct predicted sites (printed when run at top level).
length(unique(AllPlasmodium_Musite$PROTEIN_LOC))

# Left-join a GSB table onto the Musite predictions: every GSB row is kept,
# and rows without a matching prediction get NA in the Musite columns.
attach_musite_scores <- function(gsb_sites) {
  merge(gsb_sites, AllPlasmodium_Musite, by = "PROTEIN_LOC", all.x = TRUE)
}

Musite_TPP <- attach_musite_scores(TPP_GSB)
Musite_MQ <- attach_musite_scores(MQ_GSB)
Musite_PD <- attach_musite_scores(PD_GSB)

# ---- GSB sites with Musite scores: stack, summarise, test, plot ----------

# Record which pipeline each merged table came from.
Musite_PD$Pipeline <- "PD"
Musite_MQ$Pipeline <- "MQ"
Musite_TPP$Pipeline <- "TPP"

# Reduce a merged table to the columns shared by all three pipelines.
keep_result_columns <- function(merged) {
  dplyr::select(
    merged,
    c("PROTEIN_LOC", "FLR_Adj_Score", "Pipeline", "Scores", "cat", "Amino")
  )
}

R_Musite_TPP <- keep_result_columns(Musite_TPP)
R_Musite_MQ <- keep_result_columns(Musite_MQ)
R_Musite_PD <- keep_result_columns(Musite_PD)

All_Results_Musite <- rbind.data.frame(R_Musite_TPP, R_Musite_MQ, R_Musite_PD)

# Split on whether a Musite score was found for the site. Note the naming:
# `All_Results_Musite_NA` holds the rows WITH a score (NA rows removed),
# while `Musite_NA` holds the rows whose score is NA.
has_score <- !is.na(All_Results_Musite$Scores)
All_Results_Musite_NA <- All_Results_Musite[has_score, ]
Musite_NA <- All_Results_Musite[!has_score, ]

# Cross-tabulate the unscored sites by pipeline and amino acid.
CrossTable(Musite_NA$Pipeline,Musite_NA$Amino)

# Pairwise unpaired Wilcoxon tests of score between pipelines,
# Bonferroni-adjusted.
pairwise.wilcox.test(All_Results_Musite_NA$Scores, All_Results_Musite_NA$Pipeline,
                     paired = FALSE, p.adjust.method = "bonferroni")

# One fill colour per pipeline box.
colors <- c("#FFE0B2", "#FFA726", "#F57C00")

boxplot(Scores~Pipeline,data=All_Results_Musite,col=colors)



# ---- All sites reported per pipeline (pform exports) ---------------------

# Root of the per-pipeline data folders; each pipeline's files live under
# <root>/<pipeline>/Plasmodium/pform/.
pform_root <- "D:/Pipeline comparisons/Writing/Data"

# Files to load for each pipeline, in the order they are stacked. Keeping the
# manifest in one place replaces one hand-written read/select/rbind line per
# file and avoids variable names drifting from the file names they hold.
pform_files <- list(
  TPP = c(
    "TPP_PXD001684A_pform.csv",
    "TPP_PXD002266A_pform.csv",
    paste0("TPP_PXD005207A_", seq_len(11L), "_pform.csv"),
    "TPP_PXD009157A_1_pform.csv",
    "TPP_PXD009157A_2_pform.csv",
    "TPP_PXD009465A_pform.csv",
    "TPP_PXD026474A_1_pform.csv",
    "TPP_PXD026474A_2_pform.csv"
  ),
  MQ = c(
    "MQ_PXD001684A_pform.csv",
    "MQ_PXD002266A_pform.csv",
    "MQ_PXD005207A_pform.csv",
    "MQ_PXD009157A_1_pform.csv",
    "MQ_PXD009157A_2_pform.csv",
    "MQ_PXD009465A_pform.csv",
    "MQ_PXD026474A_pform.csv"
  ),
  PD = c(
    "PD_PXD001684A_pform.csv",
    "PD_PXD002266A_pform.csv",
    "PD_PXD005207A_pform.csv",
    "PD_PXD009157A_1_pform.csv",
    "PD_PXD009157A_2_pform.csv",
    # NOTE(review): this is the only file not named *_pform.csv -- confirm
    # that the PSMSITE export is the intended input for this dataset.
    "PD_PXD009465A_PSMSITE.csv",
    "PD_PXD026474A_pform.csv"
  )
)

# Load the PROTEIN_LOC column of every file of one pipeline and stack them.
#
# pipeline:   pipeline label ("TPP", "MQ" or "PD"); also the folder name.
# file_names: character vector of file names inside that pipeline's folder.
# root:       root data directory.
#
# Returns a data.frame with columns PROTEIN_LOC, Pipeline (= `pipeline`) and
# cat (= "Not chosen"), rows in the order of `file_names`.
read_pipeline_sites <- function(pipeline, file_names, root = pform_root) {
  paths <- file.path(root, pipeline, "Plasmodium", "pform", file_names)

  per_file <- lapply(paths, function(path) {
    pform <- read.csv(file = path)
    # Fail with the offending file name rather than a bare column error.
    if (!"PROTEIN_LOC" %in% names(pform)) {
      stop("No PROTEIN_LOC column in ", path, call. = FALSE)
    }
    dplyr::select(pform, c("PROTEIN_LOC"))
  })

  sites <- do.call(rbind.data.frame, per_file)
  sites$Pipeline <- pipeline
  sites$cat <- "Not chosen"
  sites
}

All_TPP <- read_pipeline_sites("TPP", pform_files$TPP)
All_MQ <- read_pipeline_sites("MQ", pform_files$MQ)
All_PD <- read_pipeline_sites("PD", pform_files$PD)

# Every reported site from every pipeline, stacked TPP, then MQ, then PD.
All_Data <- rbind.data.frame(All_TPP, All_MQ, All_PD)

# ---- "Not chosen" background set and final comparison --------------------

# Remove every reported site that already appears among the scored GSB
# results.
# NOTE(review): this exclusion ignores Pipeline, so a site present in one
# pipeline's scored GSB results is removed from the background of all three.
# Confirm that is intended.
All_Data_NC <- All_Data[!All_Data$PROTEIN_LOC %in% All_Results_Musite_NA$PROTEIN_LOC, ]

# Keep one row per site *within each pipeline*. De-duplicating on
# PROTEIN_LOC alone would keep a site only for the first pipeline in
# stacking order (TPP, then MQ, then PD) and silently drop it from the
# others, skewing the per-pipeline "Not chosen" groups compared below.
All_Data_NC <- All_Data_NC[!duplicated(All_Data_NC[, c("PROTEIN_LOC", "Pipeline")]), ]

# Attach Musite predictions and keep only the sites that received a score.
Musite_All_Data_NC <- merge(All_Data_NC, AllPlasmodium_Musite, by = "PROTEIN_LOC", all.x = TRUE)
Musite_All_Data_NC <- Musite_All_Data_NC[!is.na(Musite_All_Data_NC$Scores), ]

# Scored GSB sites plus the scored background sites, same three columns.
FinalSet <- rbind.data.frame(
  dplyr::select(All_Results_Musite_NA, c("Pipeline", "cat", "Scores")),
  dplyr::select(Musite_All_Data_NC, c("Pipeline", "cat", "Scores"))
)

# Fix the plotting order. Any value not listed in `levels` becomes NA.
FinalSet$Pipeline <- factor(FinalSet$Pipeline, levels = c("MQ", "PD", "TPP"))
FinalSet$cat <- factor(FinalSet$cat, levels = c("Gold", "Silver", "Bronze", "Not chosen"))

# `cat` varies fastest in cat*Pipeline, so each run of four boxes belongs to
# one pipeline and shares a colour.
colors <- rep(c("#FFE0B2", "#FFA726", "#F57C00"), each = 4)

boxplot(Scores~cat*Pipeline,data=FinalSet,col=colors)

# One label per pipeline/category combination for the pairwise tests.
FinalSet$Group <- paste0(FinalSet$Pipeline, ".", FinalSet$cat)

# Pairwise unpaired Wilcoxon tests between all groups, Bonferroni-adjusted.
pairwise.wilcox.test(FinalSet$Scores, FinalSet$Group, p.adjust.method = "bonferroni",
                     paired = FALSE)

